---
title: "03_preprocess_data"
author: "Jae Yeon Kim"
date: "7/2/2020"
output: html_document
---

# Import packages and files 

## Packages

```{r}
pacman::p_load(data.table, # for fast data manipulation
               tidyverse, # for tidyverse 
               ggpubr, # for arranging ggplots   
               ggthemes, # for fancy ggplot themes
               here, # for reproducibility 
               maps, # for US city data
               openintro, # for revert state name abbreviations to their original forms 
               stringr, # for easy regular expression
               lubridate, # for easy time var manipulation
               forcats, # reverse factor order 
               patchwork, # for easy ggarrange
               ggsci, # for pubs 
               gganimate, # for animated data viz 
               ggplot2)

# devtools::install_github("jaeyk/tidytweetjson", force = TRUE)
# devtools::install_github("jaeyk/makereproducible", force = TRUE)

library(tidytweetjson)
library(makereproducible)

# Use the ggpubr publication theme as the default for every ggplot below
theme_set(theme_pubr())

# Load the project's custom plotting/normalization helpers from functions/
# (source() evaluates each script in the global environment, in this order)
invisible(lapply(
  c("stacked_area_plot.R", "add_normalized.R"),
  function(script) source(here("functions", script))
))

```

## Files 

- 5,050,042 obs 

```{r}
# Load the parsed tweets saved as an RDS file under processed_data/
parsed <- here("processed_data", "parsed.rds") %>%
  readRDS()
```

# Wrangle 

## Filter

- Dropped tweets whose location field is empty or NA, leaving 3,734,756 obs (about 74% of the original data)
- I did not use country_code field as it is mostly filled with NAs (97%).

```{r}
# Share of tweets whose country_code is missing
mean(is.na(parsed$country_code))

# Keep only tweets with a usable (non-missing, non-empty) location string
filtered <- filter(parsed, !is.na(location), location != "")
```

## Select 

```{r}
# Keep only the timestamp, tweet text, and user location columns
selected <- select(filtered, created_at, full_text, location)
```

## Mutate 

### Location 

- 37% of the tweets with a non-empty location (1,394,468 obs) were created by users located in the US.

```{r}
# add_US_location() adds location indicator columns; per the original note
# these flag US locations (matched on US state and city names) and non-US
# locations (matched on country names) -- TODO confirm against the package.
# Only US_location is used here: keep the tweets flagged as US-based.
df <- selected %>%
  add_US_location() %>%
  subset(US_location == 1)
# Share of located tweets that are US-based:
# nrow(df)/nrow(selected)
```

### Text 

```{r}
# Helper: 1 if `text` matches the regular expression `pattern`, 0 otherwise,
# returned as a double (same as wrapping str_detect() in as.numeric()).
flag_keyword <- function(text, pattern) {
  as.numeric(str_detect(text, pattern))
}

# Lower-case the tweet text first so the lower-case patterns below match
# regardless of the original casing, then add one 0/1 column per keyword group.
df <- df %>%
  mutate(full_text = str_to_lower(full_text)) %>%
  mutate(
    wuhan = flag_keyword(full_text, "wuhan"),
    asian = flag_keyword(full_text, "asia|asian"),
    chinese = flag_keyword(full_text, "china|chinese"),
    trump = flag_keyword(full_text, "trump"),
    chinese_virus = flag_keyword(
      full_text,
      "chinese flu|chineseflu|chinese virus|chinesevirus"
    ),
    kung_flu = flag_keyword(full_text, "kung flu|kungflu"),
    wuhan_virus = flag_keyword(full_text, "wuhan virus|wuhanvirus"),
    anti_racism = flag_keyword(
      full_text,
      "antiracism|antiasian|stophate|acttochange|stopaapihate|stophatecrimes|racism|racist|hatecrime"
    ),
    # racism: the tweet uses at least one of the three virus-naming terms
    racism = ifelse(
      chinese_virus == 1 | kung_flu == 1 | wuhan_virus == 1,
      1,
      0
    )
  )
           
```

### Date

```{r}
# Add the date column via tidytweetjson::add_date()
# (presumably derived from created_at -- TODO confirm)
df <- df %>%
  tidytweetjson::add_date()
# Quick check of the result:
# df[1:10,] %>% select(created_at, date)

# Cache the US-only, keyword-flagged tweets on disk
fwrite(x = df, file = here("processed_data", "US_tweets.csv"))
```

```{r}
# Reload the cached US tweets (lets the analysis restart from this point)
df <- here("processed_data", "US_tweets.csv") %>%
  fread()
```

# Explore 

## Time-series

### Overall 

```{r}
# Google
# Read the Google Trends export and harmonize the keyword labels so they
# match the term labels used for the Twitter series.
gtrends <- here("processed_data", "gtrends.csv") %>%
  read_csv() %>%
  mutate(
    keywords = recode(keywords, "Kung Flu" = "Kung flu"),
    # any keyword mentioning "Chinese" / "Racism" collapses to a single label
    keywords = str_replace(keywords, ".*Chinese.*", "Chinese virus"),
    keywords = str_replace(keywords, ".*Racism.*", "Anti-racism")
  ) %>%
  rename(Terms = keywords)

# Inspect the resulting set of labels
unique(gtrends$Terms)
```

```{r}
 
# Google Trends hits over time, one panel per term, with a dashed reference
# line at 2020-03-16.
gtrends_plot <- ggplot(gtrends, aes(x = as.Date(date), y = hits)) +
  geom_line(size = 1.2) +
  geom_vline(
    xintercept = as.Date("2020-03-16"),
    linetype = "dashed",
    size = 1.2,
    color = "darkgray"
  ) +
  facet_wrap(~Terms) +
  labs(
    title = "Google Searches on COVID-19 (In the US)",
    subtitle = "Normalized to a 0-100 range",
    x = "Date",
    y = "Count"
  )
```

```{r}
# Animated version of the Google Trends figure: same layers as the static
# plot (with a blue reference line), revealed progressively along `date`.
animated_gtrends_plot <- gtrends %>%
  ggplot(aes(x = as.Date(date), y = hits)) +
  geom_line(size = 1.2) +
  labs(
    x = "Date",
    y = "Count",
    title = "Google Searches on COVID-19 (In the US)",
    subtitle = "Normalized to a 0-100 range"
  ) +
  geom_vline(
    xintercept = as.Date(c("2020-03-16")),
    linetype = "dashed",
    size = 1.2,
    color = "blue"
  ) +
  facet_wrap(~Terms) +
  transition_reveal(date)

# gifski_renderer() needs the gifski package. Check for it and fail with an
# actionable message instead of calling install.packages() on every run:
# installing from inside an analysis chunk is a hidden side effect and can
# fail when no CRAN mirror is configured for the session.
if (!requireNamespace("gifski", quietly = TRUE)) {
  stop(
    "Package 'gifski' is required to render the animation; ",
    "install it once with install.packages(\"gifski\").",
    call. = FALSE
  )
}

# Render once and keep the result so it can be both displayed and saved
gtrends_animation <- animate(
  animated_gtrends_plot,
  duration = 5,
  fps = 20,
  width = 200,
  height = 200,
  renderer = gifski_renderer()
)

# Show the animation in the rendered document
gtrends_animation

# Save the rendered object explicitly rather than relying on last_animation()
anim_save(
  here("outputs", "animated_gtrends_plot.gif"),
  animation = gtrends_animation
)
```

```{r}
# Twitter
## Build one series per term with add_normalized() (sourced from functions/;
## the plot below expects its result to carry `date` and `rescaled` columns),
## label each series, and stack them into one long table for faceting.
twitter <- bind_rows(
  add_normalized(df, chinese_virus) %>% mutate(Terms = "Chinese virus"),
  add_normalized(df, kung_flu) %>% mutate(Terms = "Kung flu"),
  add_normalized(df, wuhan_virus) %>% mutate(Terms = "Wuhan virus"),
  add_normalized(df, anti_racism) %>% mutate(Terms = "Anti-racism")
)

## Same layout as the Google Trends figure: one panel per term, with a dashed
## reference line at 2020-03-16.
twitter_plot <- ggplot(twitter, aes(x = as.Date(date), y = rescaled)) +
  geom_line(size = 1.2) +
  geom_vline(
    xintercept = as.Date("2020-03-16"),
    linetype = "dashed",
    size = 1.2,
    color = "darkgray"
  ) +
  facet_wrap(~Terms) +
  labs(
    title = "Tweets on COVID-19 (in the US)",
    subtitle = "Normalized to a 0-100 range",
    x = "Date",
    y = "Count"
  )
```

```{r}
# Stack the Twitter figure on top of the Google Trends figure (patchwork `/`)
overall_trend <- twitter_plot / gtrends_plot

# Show the combined figure in the rendered document
overall_trend

# Pass the figure explicitly so the saved file does not depend on whichever
# plot happens to be the session's last_plot()
ggsave(
  here("outputs", "overall_trend.png"),
  plot = overall_trend,
  height = 10,
  width = 10
)
```

### Keywords 

```{r}
# df <- data.table::fread(here("processed_data", "text_ready.csv"))

# Helper: apply the labels shared by all keyword panels to `plot`.
# `keyword` is the display name, used for both the title and the fill legend.
label_keyword_plot <- function(plot, keyword) {
  plot +
    labs(
      x = "Date",
      y = "Proportion",
      fill = keyword,
      title = keyword,
      subtitle = "Tweets created by the users located in the US"
    )
}

# One stacked area plot per keyword flag (stacked_area_plot() is sourced from
# functions/ and takes the flag column as a bare name)
p1 <- label_keyword_plot(stacked_area_plot(df, wuhan), "Wuhan")
p2 <- label_keyword_plot(stacked_area_plot(df, chinese), "Chinese")
p3 <- label_keyword_plot(stacked_area_plot(df, asian), "Asian")
p4 <- label_keyword_plot(stacked_area_plot(df, chinese_virus), "Chinese virus")
p5 <- label_keyword_plot(stacked_area_plot(df, kung_flu), "Kung flu")
p6 <- label_keyword_plot(stacked_area_plot(df, wuhan_virus), "Wuhan virus")

# Arrange the six panels in a 3 x 2 grid and show it in the rendered document
keyword_panels <- (p1 + p2) / (p3 + p4) / (p5 + p6)
keyword_panels

# Pass the figure explicitly rather than relying on last_plot()
ggsave(
  here("outputs", "stacked_bar_plots2.png"),
  plot = keyword_panels,
  height = 10,
  width = 9
)

# virus: 1 if the tweet uses any of the three virus-naming terms
df$virus <- ifelse(
  df$wuhan_virus == 1 | df$chinese_virus == 1 | df$kung_flu == 1,
  1,
  0
)

# Share of US tweets using any of the virus-naming terms
sum(df$virus) / nrow(df)

# Share of tweets mentioning Asia/Asian relative to the share using the
# virus-naming terms. The column is `asian`; the previous `df$asia` only
# resolved through `$` partial matching, which is fragile (it breaks if
# another column starting with "asia" is added, and tibbles do not allow it).
(sum(df$asian) / nrow(df)) / (sum(df$virus) / nrow(df))
```
# Export 

```{r}
# Write the final table (including the keyword flags and `virus`) to disk
fwrite(x = df, file = here("processed_data", "text_ready.csv"))
```